package spark.code.study.sparksql

import org.apache.spark.sql.SparkSession

/**
  * Created by peibin on 2017/2/3.
  */
object TaobaoBuyer {

  /**
    * Reads Taobao buyer e-mail dump files, derives each buyer's e-mail domain and
    * the two-level category encoded in the file path, then prints the domains
    * ranked by buyer count.
    *
    * @param args optional; `args(0)` overrides the default input glob path
    *             (falls back to the original hard-coded location when absent,
    *             so existing invocations are unchanged)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("wow")
      .master("local[*]")
      .getOrCreate()

    // For implicit conversions like converting RDDs to DataFrames
    import org.apache.spark.sql.functions.input_file_name
    import spark.implicits._

    // Allow the input location to be supplied on the command line; default to
    // the original hard-coded glob for backward compatibility.
    val inputPath = args.headOption.getOrElse(
      """file:///Users/peibin/workspace/data-mining/淘宝邮箱数据买家邮箱（2200万）/*/*.txt""")

    val buyer = spark.read.textFile(inputPath)
      .select(input_file_name, $"value")
      .as[(String, String)]
      .map { case (path, line) =>
        // The last two path segments encode category / sub-category
        // (directory name, then file name).
        val categories = path.split("/").takeRight(2)
        // Each line looks like "<key>=<email>"; keep the part after '='.
        val email = line.split("=").last
        // Everything after '@' is the mail domain. NOTE(review): a line with
        // no '@' yields the whole string as the domain — input is unvalidated.
        val emailDomain = email.split('@').last
        Buyer(email, emailDomain, categories(0), categories(1))
      }

    buyer.printSchema()
    buyer.show(10)
    buyer.createOrReplaceTempView("buyer")

    // Rank e-mail domains by number of buyers, most frequent first.
    val sql = """select emailDomain,count(1) as cnt from buyer group by emailDomain order by cnt desc"""
    spark.sql(sql).show(100)

    // Release Spark resources explicitly instead of relying on JVM shutdown.
    spark.stop()
  }

  case class Buyer(email: String, emailDomain: String, category: String, subCategory: String);
}
